HW 02

Author

Weston Scott

1 - A new day, a new plot, a new geom

edibnb <- dsbox::edibnb
glimpse(edibnb)
Rows: 13,245
Columns: 10
$ id                   <dbl> 15420, 24288, 38628, 44552, 47616,…
$ price                <dbl> 80, 115, 46, 32, 100, 71, 175, 150…
$ neighbourhood        <chr> "New Town", "Southside", NA, "Leit…
$ accommodates         <dbl> 2, 4, 2, 2, 2, 3, 5, 5, 6, 10, 2, …
$ bathrooms            <dbl> 1.0, 1.5, 1.0, 1.0, 1.0, 1.0, 1.0,…
$ bedrooms             <dbl> 1, 2, 0, 1, 1, 1, 2, 3, 4, 4, 1, 1…
$ beds                 <dbl> 1, 2, 2, 1, 1, 2, 3, 4, 5, 7, 1, 1…
$ review_scores_rating <dbl> 99, 92, 94, 93, 98, 97, 100, 92, 9…
$ number_of_reviews    <dbl> 283, 199, 52, 184, 32, 762, 7, 28,…
$ listing_url          <chr> "https://www.airbnb.com/rooms/1542…
summary(edibnb)
       id               price        neighbourhood     
 Min.   :   15420   Min.   :  0.00   Length:13245      
 1st Qu.:13279107   1st Qu.: 49.00   Class :character  
 Median :20171841   Median : 75.00   Mode  :character  
 Mean   :20077242   Mean   : 97.21                     
 3rd Qu.:27397925   3rd Qu.:110.00                     
 Max.   :36066014   Max.   :999.00                     
                    NA's   :199                        
  accommodates      bathrooms        bedrooms     
 Min.   : 1.000   Min.   :0.000   Min.   : 0.000  
 1st Qu.: 2.000   1st Qu.:1.000   1st Qu.: 1.000  
 Median : 3.000   Median :1.000   Median : 1.000  
 Mean   : 3.541   Mean   :1.226   Mean   : 1.583  
 3rd Qu.: 4.000   3rd Qu.:1.000   3rd Qu.: 2.000  
 Max.   :19.000   Max.   :9.000   Max.   :13.000  
                  NA's   :12      NA's   :4       
      beds        review_scores_rating number_of_reviews
 Min.   : 0.000   Min.   : 20.00       Min.   :  0.00   
 1st Qu.: 1.000   1st Qu.: 93.00       1st Qu.:  2.00   
 Median : 2.000   Median : 97.00       Median : 12.00   
 Mean   : 2.032   Mean   : 95.02       Mean   : 37.73   
 3rd Qu.: 3.000   3rd Qu.: 99.00       3rd Qu.: 45.00   
 Max.   :30.000   Max.   :100.00       Max.   :773.00   
 NA's   :15       NA's   :2177                          
 listing_url       
 Length:13245      
 Class :character  
 Mode  :character  
                   
                   
                   
                   
edibnb <- edibnb |>
    mutate(
        neighbourhood = fct_reorder(neighbourhood, 
                                    review_scores_rating, 
                                    .fun = median)
    ) |>
    filter(!is.na(neighbourhood)) |>
    glimpse()
Rows: 10,951
Columns: 10
$ id                   <dbl> 15420, 24288, 44552, 47616, 48645,…
$ price                <dbl> 80, 115, 32, 100, 71, 175, 150, 13…
$ neighbourhood        <fct> New Town, Southside, Leith, Souths…
$ accommodates         <dbl> 2, 4, 2, 2, 3, 5, 5, 6, 10, 2, 4, …
$ bathrooms            <dbl> 1.0, 1.5, 1.0, 1.0, 1.0, 1.0, 1.0,…
$ bedrooms             <dbl> 1, 2, 1, 1, 1, 2, 3, 4, 4, 1, 1, 1…
$ beds                 <dbl> 1, 2, 1, 1, 2, 3, 4, 5, 7, 1, 1, 1…
$ review_scores_rating <dbl> 99, 92, 93, 98, 97, 100, 92, 96, 9…
$ number_of_reviews    <dbl> 283, 199, 184, 32, 762, 7, 28, 222…
$ listing_url          <chr> "https://www.airbnb.com/rooms/1542…
ggplot(
    data = edibnb, 
    aes(
        x = review_scores_rating, 
        y = neighbourhood, 
        fill = neighbourhood
    )
) +
    
geom_density_ridges(
    scale = 2,
    rel_min_height = 0.01,
    legend.show = FALSE,
    alpha = 0.8
) +

scale_fill_viridis_d(
    option = "C", 
    begin = 0.1, 
    end = 0.9
) +

scale_y_discrete(expand = c(0, 0)) + 
scale_x_continuous(
    expand = c(0, 0.25),
    limits = c(90, 100)
) + 

coord_cartesian(clip = "off") +
labs(
    title = "Problem 1 - Ridgeline plot", 
    subtitle = "Airbnb listings: Edinburgh, Scotland",
    x = "Review Score Ratings",
    y = "Edinburgh\nNeighborhoods", 
    caption = "Source: Opensource dataset dsbox::edibnb"
) +

theme_ridges() +
theme(legend.position = "none")

Intepretation

The ridgeline plot above visualizes the distribution of Airbnb review scores across different Edinburgh neighborhoods, ordered by their respective median review scores. The neighborhoods with the highest overall reviews appear at the top of the plot with a descending order down the plot to the neighborhoods with the lowest review scores. Most review scores cluster tightly between 90 and 100 (x-axis), suggesting generally positive experiences overall in the set of reviews. However, some neighborhoods display broader distributions or lower medians. A broader distribution indicates that the reviews have a larger spread along the review spectrum.

2 - Foreign Connected PACs

# get a list of files with "Foreign Connected PAC" in their names
list_of_files <- dir_ls(path = "data", regexp = "Foreign Connected PAC")

# read all files and row bind them
# keeping track of the file name in a new column called year
pac <- read_csv(list_of_files, id = "year") |>
    glimpse()
Rows: 2,394
Columns: 6
$ year                               <chr> "data/Foreign Connec…
$ `PAC Name (Affiliate)`             <chr> "7-Eleven", "ABB Gro…
$ `Country of Origin/Parent Company` <chr> "Japan/Ito-Yokado", …
$ Total                              <chr> "$8500", "$46000", "…
$ Dems                               <chr> "$1500", "$17000", "…
$ Repubs                             <chr> "$7000", "$28500", "…

Found a reference to removing a column with select() (GeeksforGeeks 2023). Also used str_extract to filter the year (Tidyverse 2022).

pac <- pac |>
    clean_names() |>
    separate(
        country_of_origin_parent_company,
        into = c("country", "parent_company"),
        sep = "/", 
        remove = TRUE) |>

    mutate(
        year = str_extract(year, "\\d{4}-\\d{4}"),
        year = str_extract(year, "\\d{4}$"),
        year = as.integer(year)
    ) |>
    select(-total) |>
    glimpse()
Rows: 2,394
Columns: 6
$ year               <int> 2000, 2000, 2000, 2000, 2000, 2000, …
$ pac_name_affiliate <chr> "7-Eleven", "ABB Group", "Accenture"…
$ country            <chr> "Japan", "Switzerland", "UK", "UK", …
$ parent_company     <chr> "Ito-Yokado", "Asea Brown Boveri", "…
$ dems               <chr> "$1500", "$17000", "$23000", "$12500…
$ repubs             <chr> "$7000", "$28500", "$52984", "$26000…
pac <- pac |>
    pivot_longer(
        cols = c(dems, repubs),
        names_to = "party",
        values_to = "amount"
    ) |>

    mutate(
        amount = str_remove(amount, "\\$"),
        amount = as.integer(amount)
    ) |>
    glimpse()
Rows: 4,788
Columns: 6
$ year               <int> 2000, 2000, 2000, 2000, 2000, 2000, …
$ pac_name_affiliate <chr> "7-Eleven", "7-Eleven", "ABB Group",…
$ country            <chr> "Japan", "Japan", "Switzerland", "Sw…
$ parent_company     <chr> "Ito-Yokado", "Ito-Yokado", "Asea Br…
$ party              <chr> "dems", "repubs", "dems", "repubs", …
$ amount             <int> 1500, 7000, 17000, 28500, 23000, 529…
uk_spending <- pac |>
    filter(country == "UK") |>
    group_by(year, party) |>

    summarise(
        totals = sum(amount[amount != 0],
                     na.rm = TRUE), 
        .groups = "drop") |>

    arrange(year, party) |> 
    glimpse()
Rows: 24
Columns: 3
$ year   <int> 2000, 2000, 2002, 2002, 2004, 2004, 2006, 2006, …
$ party  <chr> "dems", "repubs", "dems", "repubs", "dems", "rep…
$ totals <int> 975725, 2057518, 1046183, 2002772, 1188801, 2311…
ggplot(data = uk_spending, 
       aes(x = year, y = totals / 1e6, color = party)) +

geom_line() +
scale_color_manual(values = c("repubs" = "red", 
                              "dems" = "blue"),
                  labels = c("Democrats", "Republicans")) +

scale_x_continuous(breaks = seq(1996, 2022, by = 4),
                   labels = seq(1996, 2022, by = 4)) +

scale_y_continuous(labels = function(x) paste0("$", round(x), "M")) +

labs(
    title = "Contributions to US political parties from UK-connected PACs",
    color = "Party",
    x = "Year",
    y = "Total amount",
    caption = "Source: OpenSecrets.org"
) +

theme(
    axis.title.y = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0),
                                hjust = 0),
    axis.title.x = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0), 
                                hjust = 0),
    legend.position = c(0.87, 0.15)
)

swiss_spending <- pac |>
    filter(country == "Switzerland") |>
    group_by(year, party) |>

    summarise(
        totals = sum(amount[amount != 0],
                     na.rm = TRUE), 
        .groups = "drop") |>
    arrange(year, party) |>
    glimpse()
Rows: 24
Columns: 3
$ year   <int> 2000, 2000, 2002, 2002, 2004, 2004, 2006, 2006, …
$ party  <chr> "dems", "repubs", "dems", "repubs", "dems", "rep…
$ totals <int> 361290, 710366, 508525, 722412, 627219, 1159776,…
ggplot(data = swiss_spending, 
       aes(x = year, y = totals / 1e6, color = party)) +

geom_line() +
scale_color_manual(values = c("repubs" = "red", 
                              "dems" = "blue"),
                  labels = c("Democrats", "Republicans")) +
scale_x_continuous(breaks = seq(1996, 2022, by = 4),
                   labels = seq(1996, 2022, by = 4)) +
scale_y_continuous(labels = function(x) paste0("$", round(x), "M")) +

labs(
    title = "Contributions to US political parties from Swiss-connected PACs",
    color = "Party",
    x = "Year",
    y = "Total amount",
    caption = "Source: OpenSecrets.org"
) +

theme(
    axis.title.y = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0), 
                                hjust = 0),
    axis.title.x = element_text(margin = margin(t = 0, r = 0, 
                                                b = 0, l = 0), 
                                hjust = 0),
    legend.position = c(0.87, 0.15)
)

Intepretation

Contributions from Swiss-connected PACs to U.S. political parties have grown since the year 2000. The peak around key election years. The data shows a clear preference for Republican candidates, especially from 2008 onward. This might reflect Swiss alignment of ideologies or policies with Republican platforms. In contrast, Democratic contributions also grew, though they remained more modest and stable over the yearly span of this dataset.

3 - Median housing prices in the US

median_housing <- read_csv("data/median-housing.csv")

median_housing <- median_housing |>
    rename(date = DATE) |>
    rename(price = MSPUS) |> 
    glimpse()
Rows: 234
Columns: 2
$ date  <date> 1963-01-01, 1963-04-01, 1963-07-01, 1963-10-01, …
$ price <dbl> 17800, 18000, 17900, 18500, 18500, 18900, 18900, …
recessions <- read_csv("data/recessions.csv")
glimpse(recessions)
Rows: 34
Columns: 2
$ Peak   <date> 1857-06-01, 1860-10-01, 1865-04-01, 1869-06-01,…
$ Trough <date> 1858-12-01, 1861-06-01, 1867-12-01, 1870-12-01,…
ggplot(data = median_housing, 
       aes(x = date, y = price)) +

geom_line(color = "darkblue") +
scale_x_date(breaks = seq(as.Date("1960-01-01"), 
                          as.Date("2020-01-01"), 
                          by = "5 years"),
             labels = date_format("%Y")) +

scale_y_continuous(breaks = seq(0, 400000, by = 40000),
                   labels = label_number(accuracy = 1, 
                                         big.mark = ",")) +

labs(
    title = "Median sales prices of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
    caption = "Source: Census; HUD"
) +

theme(
    plot.title.position = "plot",
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank()
)

recessions <- recessions |>
    mutate(
        is_recess = if_else(Peak >= as.Date("1963-01-01", 
                                            format = "%Y-%m-%d") & 
                            Trough <= as.Date("2021-04-01", 
                                              format = "%Y-%m-%d"),
                            TRUE, FALSE)
    ) |>
    filter(is_recess == TRUE)
glimpse(recessions)
Rows: 8
Columns: 3
$ Peak      <date> 1969-12-01, 1973-11-01, 1980-01-01, 1981-07-…
$ Trough    <date> 1970-11-01, 1975-03-01, 1980-07-01, 1982-11-…
$ is_recess <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
ggplot(data = median_housing, 
       aes(x = date, 
           y = price)) +

geom_rect(
    data = recessions,
    aes(
        xmin = as.Date(Peak), 
        xmax = as.Date(Trough),
        ymin = -Inf, 
        ymax = Inf,
        y = NULL,
        x = NULL
    ), 
    fill = "cornsilk3"
    ) +

geom_line(color = "darkblue") +
scale_x_date(breaks = seq(as.Date("1960-01-01"), 
                          as.Date("2020-01-01"), 
                          by = "5 years"),
             labels = date_format("%Y")) +

scale_y_continuous(breaks = seq(0, 400000, by = 40000),
                   labels = label_number(accuracy = 1, big.mark = ",")) +

labs(
    title = "Median sales prices of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
    caption = "Shaded areas indicate U.S. recessions\nSource: Census; HUD"
) +

theme(
    plot.title.position = "plot",
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank()
)

quarters <- median_housing |>
    mutate(
        year = year(date),
        quarter = paste0("Q", quarter(date)),
        ) |>
    arrange(date) |>
    filter(year %in% c(2019, 2020)) |> glimpse()
Rows: 8
Columns: 4
$ date    <date> 2019-01-01, 2019-04-01, 2019-07-01, 2019-10-01…
$ price   <dbl> 313000, 322500, 318400, 327100, 329000, 322600,…
$ year    <dbl> 2019, 2019, 2019, 2019, 2020, 2020, 2020, 2020
$ quarter <chr> "Q1", "Q2", "Q3", "Q4", "Q1", "Q2", "Q3", "Q4"
ggplot(data = quarters, 
       aes(x = date, 
           y = price,
          group = 1)) +

geom_line(color = "darkblue") +
geom_point(color = "darkblue", 
           size = 2, 
           shape = 21, 
           fill = "white") +

scale_y_continuous(breaks = seq(300000, 360000, by = 20000),
                   labels = label_comma()) +

scale_x_date(breaks = quarters$date,
             labels = quarters$quarter,
             expand = c(0.008, 0.008)) +

annotate("text", x = as.Date("2019-05-15"), y = 290000, label = "2019", size = 4) +
annotate("text", x = as.Date("2020-05-15"), y = 290000, label = "2020", size = 4) +

labs(
    title = "Median sales prices of houses sold in the United States",
    subtitle = "Not seasonally adjusted",
    x = NULL,
    y = "Dollars",
) +

theme(
    plot.title.position = "plot",
    panel.grid.minor.x = element_blank(),
    plot.margin = unit(c(1, 1, 2, 1), "lines"),
    axis.text.x = element_text(size = 8)
) +

coord_cartesian(ylim = c(300000, 360000), clip = "off")

4 - Expect More. Plot More.

Found a method for plotting circles in the library ggforce with geom_circle (jcblum and AdityaDey 2018), (Pedersen 2024).

target_data <- tibble(
    origin_x = 0,
    origin_y = 0,
    circle_radius = c(3, 2, 1),
    colors = c("red2", "white", "red2")
)
ggplot() +
    geom_circle(data = target_data, 
                aes(x0 = origin_x, 
                    y0 = origin_y, 
                    r = circle_radius[1], 
                    fill = colors[1]), 
                color = 'white', 
                size = 0.5) +

    geom_circle(data = target_data, 
                aes(x0 = origin_x, 
                    y0 = origin_y, 
                    r = circle_radius[2], 
                    fill = colors[2]), 
                color = 'white', 
                size = 0.5) +

    geom_circle(data = target_data, 
                aes(x0 = origin_x, 
                    y0 = origin_y, 
                    r = circle_radius[3], 
                    fill = colors[3]), 
                color = 'white', 
                size = 0.5) +

    scale_fill_identity() +
    geom_text(aes(x = 0, 
                  y = -4, 
                  label = "TARGET"), 
              size = 9, 
              fontface = "bold", 
              color = "red2") +

    geom_text(aes(x = 1.8, 
                  y = -4.2, 
                  label = "\U00AE"), ## googled the unicode escape sequence
              size = 7, 
              fontface = "bold", 
              color = "red2") +
    
    coord_fixed(ratio = 1) +
    theme_void() +
    theme(plot.margin = margin(20, 20, 40, 20))

5 - Mirror, mirror on the wall, who’s the ugliest of them all?

penguins |>
  glimpse()
Rows: 344
Columns: 8
$ species     <fct> Adelie, Adelie, Adelie, Adelie, Adelie, Ade…
$ island      <fct> Torgersen, Torgersen, Torgersen, Torgersen,…
$ bill_len    <dbl> 39.1, 39.5, 40.3, NA, 36.7, 39.3, 38.9, 39.…
$ bill_dep    <dbl> 18.7, 17.4, 18.0, NA, 19.3, 20.6, 17.8, 19.…
$ flipper_len <int> 181, 186, 195, NA, 193, 190, 181, 195, 193,…
$ body_mass   <int> 3750, 3800, 3250, NA, 3450, 3650, 3625, 467…
$ sex         <fct> male, female, female, NA, female, male, fem…
$ year        <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2…
ggplot(data = penguins,
       aes(x = bill_len, 
           y = body_mass, 
           color = island)) +

geom_point() +
labs(
    x = "Bill Length (mm)",
    y = "Body Mass (g)",
    title = "Penguin Body Mass Positively Correlates with Bill Length",
    subtitle = "By Island",
    color = "Island"
)

Found a useful source for changing background panel colors (R-charts 2023). I changed the background colors of most parts of the graph, including axis text. If you look closely, I also flipped the axis tick mark labels 180 degrees, as well as the x-axis label upside down. Why not?

ggplot(data = penguins,
       aes(x = bill_len, 
           y = body_mass, 
           color = island,
           shape = species)) +

geom_point(size = 10) +

scale_color_manual(values = c("magenta", "chartreuse", "yellow")) +
labs(
    x = "Bill Length (mm)",
    y = "Body Mass (g)",
    title = "Penguin Body Mass Positively Correlates with Bill Length",
    subtitle = "By Island",
    color = "Island",
    shape = "Species"
) +

theme_dark() +
theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank(),
    plot.background = element_rect(fill = "magenta"),         ## magenta background
    panel.background = element_rect(fill = "cyan"),           ## cyan panel
    panel.grid.major = element_line(color = "chartreuse"),    ## bright green gridlines
    panel.grid.minor = element_line(color = "chartreuse"),    ## bright green minor gridlines
    axis.text = element_text(color = "yellow", angle = 180),  ## yellow axis tick labels
    axis.title = element_text(color = "yellow", angle = 180), ## yellow axis titles
    legend.background = element_rect(fill = "chartreuse"),    ## bright green legend background
    legend.text = element_text(color = "magenta")             ## magenta legend text
)

References

GeeksforGeeks. 2023. “How to Remove a Column Using Dplyr Package in r.” https://www.geeksforgeeks.org/how-to-remove-a-column-using-dplyr-package-in-r/.
jcblum, and AdityaDey. 2018. “Circle in Ggplot2.” May 18, 2018. https://forum.posit.co/t/circle-in-ggplot2/8543.
Pedersen, Thomas Lin. 2024. “Geom_circle Function | Ggforce Package.” https://ggforce.data-imaginist.com/reference/geom_circle.html.
R-charts. 2023. “Ggplot2 Background Color.” https://r-charts.com/ggplot2/background-color/.
Tidyverse. 2022. “Str_extract: Extract Matching Patterns from a String.” https://stringr.tidyverse.org/reference/str_extract.html.